The objective is to classify users with respect to the label attribute, based on the other attributes in the dataset.
(Details and visualization are embedded below)
Handling missing values:
Analysing distributions:
web_site - the attribute has high cardinality, and 2 of its categories represent 52% of the examples while having 0 click occurrences.
user_identifier - has a very long tail distribution
request_time - spans over 2 months during 2019 with additive seasonality during weekends
browser, os and state - don't have high cardinality; they seem to add more information with various distributions.
Target attribute:
Feature representation
web_site - the 2 large categories with no clicks were filtered out in order to help the model, and categories with fewer than 1K occurrences were factorized into a single category
user_identifier - the entire dataset was aggregated by this attribute; it could also have been embedded with other attributes
request_time - was split into multiple attributes: month, day, weekday name, hour
All features were transformed into their one-hot representation after the user_identifier aggregation
Fitting a baseline model:
Fitting multiple algorithms' models
Assumption:
objective function:
Choosing the right metric:
from __future__ import print_function
import os
import pickle
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
import plotly.express as px
from datetime import datetime
# Notebook setup: inline plotting plus wide pandas display limits so
# DataFrame previews are not truncated.
get_ipython().run_line_magic('matplotlib', 'inline')
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 999)
# Load the ad-impression dataset; each row is one ad view with a binary
# 'label' column (presumably 1 = click, given the sum below counts clicks).
df = pd.read_csv('exam_exam_ds.csv')
print(df.info())
# Profit model implied by the formula: 60 revenue per click minus 1 cost
# per view, i.e. profit = 60 * clicks - total views.
number_of_clicks = df['label'].sum()
cur_profit = number_of_clicks * 60 - df['label'].count()
print('There are {} viewes of ads with {} of clicks and a total loss of {}'.format(df['label'].count(), number_of_clicks, cur_profit))
def count_val(x_axis, order_by=False):
    """Scatter-plot per-category stats of the global `df`.

    Groups `df` by `x_axis` and plots the example count per category,
    with marker size proportional to the mean label (click rate).
    When `order_by` is truthy, the x axis is sorted by descending total.
    """
    grouped = df.groupby(x_axis)['label'].agg(['count', 'mean']).reset_index()
    fig = px.scatter(grouped, x=x_axis, y="count", size='mean')
    if order_by:
        # update_xaxes mutates the figure in place and returns it.
        fig = fig.update_xaxes(categoryorder="total descending")
    fig.show()
def dateparser(i):
    """Parse a raw request_time string into date/time component features.

    Strips fractional seconds and any trailing alphabetic timezone token
    (e.g. '2019-03-15 10:20:30.123 UTC' -> '2019-03-15 10:20:30'), then
    expands the timestamp into a Series of date/month/day/weekday/hour/time.
    """
    # Raw string avoids invalid-escape warnings; [A-Za-z] fixes the classic
    # [A-z] bug (that range also matches '[', '\\', ']', '^', '_', '`').
    parsed_date = re.split(r'\.|\s[A-Za-z]', i)
    time_stamp = datetime.strptime(parsed_date[0], '%Y-%m-%d %H:%M:%S')
    # Convert to a pandas Timestamp: day_name() does not exist on datetime.
    pd_date = pd.to_datetime(time_stamp)
    return pd.Series({'date': pd_date.strftime('%d/%m/%Y'),
                      'month': pd_date.month,
                      'day': pd_date.day,
                      'weekday': pd_date.day_name(),
                      'hour': pd_date.strftime('%H'),
                      'time': pd_date.time()
                      })
def merge_classes(column, min_records, encode=True):
    """Collapse rare categories of a Series into a single 'other' class.

    Parameters
    ----------
    column : pd.Series
        Categorical column to merge.
    min_records : int
        Categories occurring fewer than this many times are merged.
    encode : bool
        When True, label-encode the merged column with sklearn.

    Returns
    -------
    pd.Series or np.ndarray
        The merged column (integer-encoded when `encode` is True).
    """
    counts = column.value_counts()
    merged_column = column.copy()
    # counts[column] maps each row to its category's global frequency;
    # rows from rare categories are lumped into 'other'.
    merged_column.loc[counts[column].values < min_records] = 'other'
    # Use the public .name accessor (the original relied on private ._name).
    print('Number of classes in {} after merge: {}'.format(column.name, len(merged_column.value_counts())))
    if encode:
        encoder = preprocessing.LabelEncoder()
        merged_column = encoder.fit_transform(merged_column)
    return merged_column
# Expand request_time into date/month/day/weekday/hour/time columns and
# append them to the main frame (row-aligned via axis=1 concat).
df_dates = df['request_time'].apply(dateparser)
df = pd.concat([df, df_dates], axis=1)
# Visualize per-site volume, ordered by descending total count.
count_val("web_site", True)
# Drop the two dominant sites -- per the notes above, they carry no clicks.
df = df[~df['web_site'].isin(['hdpopcorns.co', 'cyberreel.com'])]
The threshold of 1000+ was chosen mainly for computational reasons.
# Collapse web_site categories with fewer than 1000 occurrences into an
# 'other' bucket (encode=False: keep the string labels for now).
df['web_site_cat_1000'] = merge_classes(df["web_site"], 1000, False)
count_val('browser', True)
count_val('date')
count_val('weekday')
# Feature columns: everything except raw identifiers/timestamps and the target.
cols = df.columns.drop(['request_time', 'label', 'time', 'user_identifier', 'date', 'web_site'])
df[cols].nunique()
# Aggregate to one row per (user_identifier, label): each feature column
# becomes the list of values observed for that user.
df_agg = df.groupby(['user_identifier', 'label'], as_index=False)[cols].agg(lambda x: list(x))
df_agg.head()
df_agg.shape
df_agg.sample(10)
from sklearn.preprocessing import MultiLabelBinarizer
def to_one_hot(col, df):
    """One-hot encode a list-valued aggregated column, indexed by user.

    Each row of df[col] is a list of the category values accumulated for
    one user; MultiLabelBinarizer turns those lists into indicator
    columns named '<col>_<value>'. NOTE: the `df` parameter shadows the
    module-level DataFrame of the same name.
    """
    mlb = MultiLabelBinarizer()
    one_hot = pd.DataFrame(mlb.fit_transform(df[col]),
                           columns=['{}_'.format(col) + str(x) for x in mlb.classes_],
                           index=df.user_identifier)
    # DataFrame.max(level=0) was deprecated and removed in pandas 2.0;
    # groupby on the index level is the supported equivalent: collapse
    # duplicate user_identifier rows, keeping a 1 if the value ever appeared.
    return one_hot.groupby(level=0).max()
# One-hot encode every aggregated list column and join the indicator
# frames back onto the per-user frame via user_identifier.
df_onehot = df_agg
for i in cols:
df_onehot = df_onehot.merge(to_one_hot(i, df_agg), on='user_identifier')
# Drop the original list-valued columns, keeping only the indicators.
df_final = df_onehot.drop(cols, axis=1)
df_final.shape
df_final.sample(10)
y = df_final['label']
X = df_final.drop(['label', 'user_identifier'], axis=1)
# 75/25 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print("Train set data shape: {} \ny_train labeled data shape: {}".format(X_train.shape, y_train.sum()))
def evaluate_with_price(x, y_true, clf, threshold=0.0164):
    """Score a fitted classifier and translate predictions into profit.

    Thresholds clf's positive-class probability at `threshold`, prints
    recall/precision/F1, and assigns a monetary outcome per prediction:
    -1 for a predicted click that did not happen, +60 for a correctly
    predicted click, 0 when no ad would be shown.

    Returns the per-row DataFrame with probabilities, decisions and values.
    """
    proba = clf.predict_proba(x)
    tmp = pd.DataFrame(np.column_stack([proba, y_true.to_numpy()]),
                       columns=['pred_0', 'pred_1', 'true'])
    tmp['pred_value'] = np.where(tmp['pred_1'] > threshold, 1, 0)
    shown = tmp['pred_value'] == 1
    tmp['expected_value'] = 0
    tmp.loc[shown & (tmp['true'] == 0), 'expected_value'] = -1
    tmp.loc[shown & (tmp['true'] == 1), 'expected_value'] = 60
    decisions = tmp['pred_value'].values
    print('Score after tranformation \nRecall: {:.4}\nPrecision: {:.4}\nF1: {:.4}'.format(
        recall_score(y_true, decisions),
        precision_score(y_true, decisions),
        f1_score(y_true, decisions)))
    cur_profit = tmp['expected_value'].sum()
    print('There were {:,} clicks on ads with predicted {:,} clicks and a total profit of {:,}'.format(
        tmp['true'].sum(),
        tmp['pred_value'].sum(),
        cur_profit))
    return tmp
def plot_pred_hist(axis='pred_1'):
    """Plot density histograms of `axis` from the global `results` frame,
    colored by the true label, to inspect class separation."""
    px.histogram(results, x=axis, color="true", histnorm='probability density').show()
Model training:
# Evaluation pairs for XGBoost's per-round metric reporting / early stopping.
eval_set = [(X_train, y_train), (X_test, y_test)]
# Negative/positive ratio to up-weight the minority (click) class.
ratio = (y_train==False).sum()/(y_train==True).sum()
early_stopping_rounds = 10
clf = XGBClassifier(
eval_metric = "logloss",
objective='binary:logistic',
random_state=1,
scale_pos_weight = ratio
)
# Randomized-search space over boosting hyperparameters.
# NOTE(review): learning_rate=5 is far above typical values (<=1) --
# confirm this extreme is intentional rather than a typo for 0.5.
params = {
'min_child_weight': [0.5, 1, 6],
'learning_rate':[0.1, 0.3, 1, 5],
'n_estimators':[300, 400, 600]
}
# Track several metrics during CV, but refit the final model on best F1.
scoring = ['roc_auc', 'f1', 'recall', 'precision']
# NOTE(review): passing early_stopping_rounds to fit() is deprecated in
# recent xgboost versions (moved to the estimator constructor); confirm
# the pinned xgboost version supports it.
GsCv = RandomizedSearchCV(clf,
params,
n_iter=4,
cv=3,
scoring = scoring,
refit = 'f1').fit(X_train, y_train,
eval_set=eval_set,
early_stopping_rounds=early_stopping_rounds,
verbose=100)
print(GsCv.best_params_)
# Show mean CV scores and parameters, ranked by the refit metric (F1).
pd.DataFrame(GsCv.cv_results_).sort_values('rank_test_'+GsCv.refit).filter(regex='mean_|param_',axis=1)
# Refit the best-found estimator on the training split with early stopping.
bst = GsCv.best_estimator_.fit(X_train, y_train,
eval_set=eval_set,
early_stopping_rounds=early_stopping_rounds,
verbose=False)
# Evaluate at the default (break-even) threshold, inspect the predicted
# probability distribution, then re-evaluate at a stricter 0.7 threshold.
results = evaluate_with_price(X_test, y_test, bst)
plot_pred_hist()
results = evaluate_with_price(X_test, y_test, bst, 0.7)
from sklearn.ensemble import RandomForestClassifier
# Comparison model: a class-weighted random forest evaluated with the
# same profit-based metric on the same test split.
rf = RandomForestClassifier(n_jobs=-1,
n_estimators=100,
min_samples_leaf=2,
max_features=None,
class_weight='balanced',
verbose=False).fit(X_train, y_train)
results = evaluate_with_price(X_test, y_test, rf)
plot_pred_hist()
When we compare the models, they seem to have similar results.